Data Overview

readr::read_csv(here("data/character_list5.csv"),
                      progress = FALSE,
                      col_types = cols(
                                    script_id = col_integer(),
                                    imdb_character_name = col_character(),
                                    words = col_integer(),
                                    gender = col_character(),
                                    age = col_character()
                                    )) %>%
  mutate(age = as.numeric(age)) -> characters_list

characters_list %>% 
  glimpse()
## Observations: 23,048
## Variables: 5
## $ script_id           <int> 280, 280, 280, 280, 280, 280, 280, 623, 62...
## $ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "fr...
## $ words               <int> 311, 873, 138, 2251, 190, 723, 1908, 328, ...
## $ gender              <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f...
## $ age                 <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58...
readr::read_csv(here("data/meta_data7.csv"),
                      progress = FALSE,
         col_types = cols(
                        script_id = col_integer(),
                        imdb_id = col_character(),
                        title = col_character(),
                        year = col_integer(),
                        gross = col_integer(),
                        lines_data = col_character()
                        )) %>%
  mutate(title = iconv(title,"latin1", "UTF-8")) -> meta_data

meta_data %>%
  glimpse()
## Observations: 2,000
## Variables: 6
## $ script_id  <int> 1534, 1512, 1514, 1517, 1520, 6537, 3778, 623, 1525...
## $ imdb_id    <chr> "tt1022603", "tt0147800", "tt0417385", "tt2024544",...
## $ title      <chr> "(500) Days of Summer", "10 Things I Hate About You...
## $ year       <int> 2009, 1999, 2005, 2013, 2010, 2007, 1992, 2001, 200...
## $ gross      <int> 37, 65, NA, 60, 20, 91, 15, 37, 74, 80, 376, 192, 9...
## $ lines_data <chr> "74354452567747744433425777756577444344445644567454...

Combinando Dados Originais

left_join(characters_list, 
          meta_data, 
          by=c("script_id")) %>%
  group_by(title, year) %>%
  drop_na(gross) %>%
  ungroup() -> scripts_data

scripts_data %>%
  glimpse()
## Observations: 19,387
## Variables: 10
## $ script_id           <int> 280, 280, 280, 280, 280, 280, 280, 623, 62...
## $ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "fr...
## $ words               <int> 311, 873, 138, 2251, 190, 723, 1908, 328, ...
## $ gender              <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f...
## $ age                 <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58...
## $ imdb_id             <chr> "tt0112579", "tt0112579", "tt0112579", "tt...
## $ title               <chr> "The Bridges of Madison County", "The Brid...
## $ year                <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, ...
## $ gross               <int> 142, 142, 142, 142, 142, 142, 142, 37, 37,...
## $ lines_data          <chr> "43320234343434432034334343344334343434344...
scripts_data %>%
  mutate(fem_words = ifelse(gender == "f",words,0),
         man_words = ifelse(gender == "m",words,0)) %>%
  group_by(title, year) %>%
  mutate(total_fem_words = sum(fem_words),
         total_man_words = sum(man_words)) %>%
  filter(total_fem_words !=  0) %>%
  filter(total_man_words !=  0) %>%
    mutate(f_m_ratio = sum(gender == "f")/sum(gender == "m"),
           f_m_wordratio = total_fem_words/total_man_words) %>%
  ungroup()  -> scripts_data

scripts_data %>%
  select(title,
         year,
         f_m_ratio,
         f_m_wordratio) %>%
  sample_n(10)
summary(scripts_data$f_m_wordratio)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.00722   0.16010   0.30850   0.68080   0.60660 153.80000

Female/Male Word Ratio

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x=f_m_wordratio,
             y=(..count..)/sum(..count..))) +
  geom_histogram(binwidth = 1,
                 boundary = 0,
                 fill = "grey",
                 color = "black")

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x="", 
             y=words)) +
  geom_violin(fill="grey",
               width=0.5)

Female/Male Ratio

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x=f_m_ratio,
             y=(..count..)/sum(..count..))) +
  geom_histogram(binwidth = 0.1,
                 boundary = 0,
                 fill = "grey",
                 color = "black")

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x="", 
             y=f_m_ratio)) +
  geom_violin(fill="grey",
               width=0.5)

Movie Year

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x=year)) +
  geom_bar(fill = "grey",
           color = "black")

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x="", 
             y=year)) +
  geom_violin(fill="grey",
               width=0.5)

Gross

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x=gross,
             y=(..count..)/sum(..count..))) +
  geom_histogram(binwidth = 50,
                 boundary = 0,
                 fill = "grey",
                 color = "black")

scripts_data %>%
  group_by(title,year) %>%
  unique() %>%
  ggplot(aes(x="", 
             y=gross)) +
  geom_violin(fill="grey",
               width=0.5)

Scaling Data

scripts_data %>%
  group_by(title) %>%
  slice(1) %>%
  unique() %>%
  ungroup() %>%
  select(title,
         gross,
         f_m_ratio,
         f_m_wordratio) -> data

select(data, -title) %>%
mutate_all(funs(scale)) -> scaled_data

scaled_data %>% 
  sample_n(10)

Número K ótimo

GAP statistic

A GAP compara a solução do agrupamento com cada k com a solução em um dataset onde não há estrutura de grupos.

plot_clusgap = function(clusgap, title="Gap Statistic calculation results"){
    require("ggplot2")
    gstab = data.frame(clusgap$Tab, k=1:nrow(clusgap$Tab))
    p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size=5)
    p = p + geom_errorbar(aes(ymax=gap+SE.sim, ymin=gap-SE.sim), width = .2)
    p = p + ggtitle(title)
    return(p)
}
gaps <- scaled_data %>% 
    clusGap(FUN = kmeans,
            nstart = 20,
            K.max = 8,
            B = 200,
            iter.max=30)
plot_clusgap(gaps)

  • 3 ou 6 grupos parece apropiado, mas como 6 é precedido por uma série de quedas 3 seria uma melhor opção.

Elbow Method

set.seed(123)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15

wss <- sapply(1:k.max, 
              function(k){kmeans(scaled_data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

  • Pelo Elbow method 3 parece ser um bom número de grupos devido à queda de 3 para 4.

Bayesian Information Criterion

  • Visualmente K= 2 e K = 3 representam o ganho mais significativo em termos de BIC (Bayesian Information Criterion)

Hubert Index e D Index

nb <- NbClust(scaled_data, diss=NULL, distance = "euclidean", 
              min.nc=2, max.nc=5, method = "kmeans", 
              index = "all", alphaBeale = 0.1)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 8 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 8 proposed 5 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))

  • O índice de Hubert e o índice D sugerem K = 3 como a melhor solução

K-Means


Clustering

n_clusters = 3

scaled_data %>%
    kmeans(n_clusters, iter.max = 100, nstart = 20) -> km

p <- autoplot(km, data=scaled_data, frame = TRUE)  

ggplotly(p)
  • É possível ver os 3 grupos nitidamente distintos, por meio de um zoom percebe-se que embora o grupo 1 e o grupo 3 estejam próximos o overlap é basicamente inexistente.
row.names(scaled_data) <- data$title

toclust <- scaled_data %>% 
    rownames_to_column(var = "title") 

km = toclust %>% 
    select(-title) %>% 
    kmeans(centers = n_clusters, iter.max = 100, nstart = 20)

km %>% 
    augment(toclust) %>% 
    gather(key = "variável", value = "valor", -title, -.cluster) %>% 
    ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) + 
    geom_point(alpha = 0.2) + 
    geom_line(alpha = .5) + 
    facet_wrap(~ .cluster) +
    coord_flip()

  • Grupo 1 Yes, We Can
    • Menor Faturamento
    • Mais dialógo para as mulheres
    • Maior taxa de personagens femininos
  • Grupo 2 - It’s a man’s world
    • Maior faturamento entre todos
    • Menor taxa de dialógo para as mulheres
    • Menor taxa de personagens femininos
  • Grupo 3 - Sitting on the Fence
    • Mediano em termos de dialógo, personagens e faturamento

Silhouette

dists = scaled_data %>% 
  dist()

scaled_data %>%
    kmeans(3, iter.max = 100, nstart = 20) -> km


silhouette(km$cluster, dists) %>%
   plot(col = RColorBrewer::brewer.pal(4, "Set2"),border=NA)

  • O valor de 0.6 da silhueta significa que a nossa clusterização foi razoável.